library(EnhancedVolcano, verbose = FALSE)
## Loading required package: ggplot2
## Loading required package: ggrepel
## Warning: package 'ggrepel' was built under R version 4.3.3
library(GEOquery, verbose = FALSE)
## Loading required package: Biobase
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(limma, verbose = FALSE)
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(umap, verbose = FALSE)
library(dplyr, verbose = FALSE)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
################################################################
# Differential expression analysis with limma
# load series and platform data from GEO (date: 2024/10/15)
gset <- getGEO("GSE63127", GSEMatrix =TRUE, AnnotGPL=TRUE)
## Found 1 file(s)
## GSE63127_series_matrix.txt.gz
if (length(gset) > 1) idx <- grep("GPL570", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
# make proper column names to match toptable
fvarLabels(gset) <- make.names(fvarLabels(gset))
# group membership for all samples
gsms <- paste0("X00100011111X00000000000X00000X000000000000X000X00",
"XX00X0XXXXXXXXX1111111111111111111111X111X11111111",
"XXXX1XXXXXXXXXXXXXXXXXXXXXXXX001000000010100111111",
"01110111110011111001110011101011111001110101100011",
"111111111111111111111111111111")
sml <- strsplit(gsms, split="")[[1]]
# filter out excluded samples (marked as "X")
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]
gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
length(sml) # 182 samples
## [1] 182
# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("Non-smoker","Smoker"))
levels(gs) <- groups
gset$group <- gs
# Checking how many samples were excluded
A1_samples <- strsplit("X00100011111X00000000000X00000X000000000000X000X00XX00X0XXXXXXXXX1111111111111111111111X111X11111111XXXX1XXXXXXXXXXXXXXXXXXXXXXXX00100000001010011111101110111110011111001110011101011111001110101100011111111111111111111111111111111", "")[[1]]
length(A1_samples[A1_samples=="X"])
## [1] 48
## Make histograms and boxplots to check if the data is log-transformed and needs quantile normalization
## Histogram ##
hist(as.matrix(exprs(gset)),
main = "GSE63127 overall distribution of expression values",
xlab = "RMA normalized probe intensity",
ylab = "Frequency") # skewed left, needs log2 transform
# boxplot(exprs(gset),)
# It takes up lots of computation so commenting out for now
max(exprs(gset))
## [1] 136808
min(exprs(gset))
## [1] 0.0657913
# Should do log2 normalization
## log2 normalization ##
exprs(gset) <- log2(exprs(gset)+1)
# quantile normalization: no longer doing this to better capture variation
## exprs(gset) <- normalizeBetweenArrays(exprs(gset))
hist(as.matrix(exprs(gset)),
main = "GSE63127 distribution of expression values",
xlab = "RMA and log2 normalized probe intensity",
ylab = "Frequency") # much better
# expression value distribution
par(mar=c(4,4,2,1))
title <- paste ("GSE63127", "/", annotation(gset), " value distribution", sep ="")
plotDensities(exprs(gset), group=gs, main=title, legend ="topright")
### Generate a boxplot ###
ex <- exprs(gset)
ord <- order(gs) # order samples by group
palette(
c("#7570B3", "#1B9E77")
)
par(mar=c(7,4,2,1))
title <- "GSE63127 sample-wise distribution of expression values"
boxplot(ex[,ord], boxwex=0.6, notch=T, outline=FALSE, las=2, col=gs[ord],
main=title,
#xlab = "Samples",
ylab = "RMA and log2 normalized probe intensity",
xaxt="n",
whisklty = 0,
#border = gs[ord]
lwd = 0.8,
space = -1
)
# Add axis labels for clarification
mtext("Samples", side=1, line=2) # Add "Samples" as the x-axis label
# Add legend
legend("bottomleft", inset=c(0, -0.3), fill=palette(), legend=groups,
bty="n", xpd=TRUE) # Adjust inset and allow drawing outside plot area
min(exprs(gset))
## [1] 0.09192496
max(exprs(gset))
## [1] 17.0618
phenotypic_data <- pData(gset) # Extract phenotypic data
head(phenotypic_data, n = 3)
## title geo_accession status
## GSM190150 Small airways, non-smoker 029 GSM190150 Public on Dec 16 2008
## GSM190153 Small airways, non-smoker 036 GSM190153 Public on Jun 17 2008
## GSM254157 small airways, smoker 112 GSM254157 Public on Jun 17 2008
## submission_date last_update_date type channel_count
## GSM190150 May 17 2007 Aug 28 2018 RNA 1
## GSM190153 May 17 2007 Aug 28 2018 RNA 1
## GSM254157 Jan 03 2008 Aug 28 2018 RNA 1
## source_name_ch1
## GSM190150 airway epithelial cells obtained by bronchoscopy and brushing
## GSM190153 airway epithelial cells obtained by bronchoscopy and brushing
## GSM254157 airway epithelial cells obtained by bronchoscopy and brushing
## organism_ch1 characteristics_ch1 characteristics_ch1.1
## GSM190150 Homo sapiens age: 34 sex: M
## GSM190153 Homo sapiens age: 45 sex: F
## GSM254157 Homo sapiens age: 45 sex: M
## characteristics_ch1.2 characteristics_ch1.3
## GSM190150 ethnic group: black smoking status: non-smoker
## GSM190153 ethnic group: hispanic smoking status: non-smoker
## GSM254157 ethnic group: white smoking status: smoker, 23 pack-years
## molecule_ch1
## GSM190150 total RNA
## GSM190153 total RNA
## GSM254157 total RNA
## extract_protocol_ch1
## GSM190150 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
## GSM190153 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
## GSM254157 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
## label_ch1
## GSM190150 biotin
## GSM190153 biotin
## GSM254157 biotin
## label_protocol_ch1
## GSM190150 Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 3 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
## GSM190153 Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 3 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
## GSM254157 Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 1-2 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
## taxid_ch1
## GSM190150 9606
## GSM190153 9606
## GSM254157 9606
## hyb_protocol
## GSM190150 Following fragmentation, 15 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
## GSM190153 Following fragmentation, 15 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
## GSM254157 Following fragmentation, 10 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
## scan_protocol
## GSM190150 GeneChips were scanned using the GeneChip Scanner 3000 7G.
## GSM190153 GeneChips were scanned using the GeneChip Scanner 3000 7G.
## GSM254157 GeneChips were scanned using the GeneChip Scanner 3000 7G.
## description
## GSM190150 small airways, non-smoker
## GSM190153 small airways, non-smoker
## GSM254157 small airways, smoker 112
## data_processing
## GSM190150 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
## GSM190153 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
## GSM254157 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
## platform_id contact_name contact_email
## GSM190150 GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
## GSM190153 GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
## GSM254157 GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
## contact_laboratory contact_department
## GSM190150 Crystal Department of Genetic Medicine
## GSM190153 Crystal Department of Genetic Medicine
## GSM254157 Crystal Department of Genetic Medicine
## contact_institute contact_address contact_city
## GSM190150 Weill Cornell Medical College 1300 York Avenue New York
## GSM190153 Weill Cornell Medical College 1300 York Avenue New York
## GSM254157 Weill Cornell Medical College 1300 York Avenue New York
## contact_state contact_zip/postal_code contact_country
## GSM190150 NY 10021 USA
## GSM190153 NY 10021 USA
## GSM254157 NY 10021 USA
## supplementary_file
## GSM190150 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190150/suppl/GSM190150.CEL.gz
## GSM190153 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190153/suppl/GSM190153.CEL.gz
## GSM254157 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM254nnn/GSM254157/suppl/GSM254157.CEL.gz
## supplementary_file.1
## GSM190150 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190150/suppl/GSM190150.CHP.gz
## GSM190153 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190153/suppl/GSM190153.CHP.gz
## GSM254157 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM254nnn/GSM254157/suppl/GSM254157.CHP.gz
## data_row_count relation relation.1
## GSM190150 54675 Reanalyzed by: GSE119087
## GSM190153 54675 Reanalyzed by: GSE60486 Reanalyzed by: GSE119087
## GSM254157 54675 Reanalyzed by: GSE60486 Reanalyzed by: GSE119087
## age:ch1 cilia length:ch1 copd status:ch1
## GSM190150 34 <NA> <NA>
## GSM190153 45 <NA> <NA>
## GSM254157 45 <NA> <NA>
## department of genetic medicine id:ch1 ethnic group:ch1 ethnicity:ch1
## GSM190150 <NA> black <NA>
## GSM190153 <NA> hispanic <NA>
## GSM254157 <NA> white <NA>
## serum 25-oh-d:ch1 sex:ch1 smoking status:ch1 group
## GSM190150 <NA> M non-smoker Non.smoker
## GSM190153 <NA> F non-smoker Non.smoker
## GSM254157 <NA> M smoker, 23 pack-years Smoker
# This is filtered down to the samples that were included.
# I will first try to clean up the phenotypic data.
# The features I want to keep are:
# Dates of submission/updates etc, sex, ethnicity, vitamin D, smoking status, cilia length
# Will not use most of the information but keeping it for the sake of interest)
# Keep the columns that might contain data of interest, which will need to be cleaned up.
# List of column names I want to keep and clean up into usable labels
columns_to_find <- c("geo_accession","status","submission_date","last_update_date","characteristics_ch1","characteristics_ch1.1","characteristics_ch1.2","characteristics_ch1.3","age:ch1","cilia length:ch1","ethnic group:ch1","ethnicity:ch1","serum 25-oh-d:ch1","sex:ch1","smoking status:ch1","group")
# Get the column indexes
indexes <- sapply(columns_to_find, function(col_name) which(names(phenotypic_data) == col_name))
indexes <- unlist(indexes)
phenotypic_data <- phenotypic_data[,c(indexes)]
# Now I need to parse out sex, ethnicity, smoking status, and age, vitamin D, pack years.
#Rename "group" as "smoking status"
names(phenotypic_data)[16] <- "smoking_status"
## Grabbing ethnicity values from the columns ##
# Initialize a new column "ethnicity" with NA values
phenotypic_data$ethnicity <- NA
# Function to find 'eth' in a row and return the corresponding value
find_ethnicity <- function(row) {
eth_column <- which(grepl('eth', row))
if (length(eth_column) > 0) {
return(row[eth_column])
} else {
return(NA)
}
}
# Apply the function row-wise to populate the "ethnicity" column
phenotypic_data$ethnicity <- apply(phenotypic_data, 1, find_ethnicity)
## Grabbing sex values from the columns ##
# Initialize a new column "sex" with NA values
phenotypic_data$sex <- NA
# Function to find 'sex' in a row and return the corresponding value
find_sex <- function(row) {
sex_column <- which(grepl('sex', row))
if (length(sex_column) > 0) {
return(row[sex_column])
} else {
return(NA)
}
}
# Apply the function row-wise to populate the "sex" column
phenotypic_data$sex <- apply(phenotypic_data, 1, find_sex)
## Grabbing pack_years values from the columns ##
# Initialize a new column "pack_years" with NA values
phenotypic_data$pack_years <- NA
# Function to find 'pack_years' in a row and return the corresponding value, but just the first instance
find_pack_years <- function(row) {
pack_years_column <- which(grepl('pack', row))
if (length(pack_years_column) > 0) {
return(row[pack_years_column[1]])
} else {
return(NA)
}
}
# Apply the function row-wise to populate the "pack_years" column
phenotypic_data$pack_years <- apply(phenotypic_data, 1, find_pack_years)
#unlist the column
phenotypic_data$pack_years <- unlist(phenotypic_data$pack_years )
## Grabbing age values from the columns ##
# Initialize a new column "age" with NA values
phenotypic_data$age <- NA
# Function to find 'age' in a row and return the corresponding value
find_age <- function(row) {
age_column <- which(grepl('age', row))
if (length(age_column) > 0) {
return(row[age_column])
} else {
return(NA)
}
}
# Apply the function row-wise to populate the "age" column
phenotypic_data$age <- apply(phenotypic_data, 1, find_age)
## Grabbing vitamin_d values from the columns ##
# Initialize a new column "vitamin_d" with NA values
phenotypic_data$vitamin_d <- NA
# Function to find 'vitamin_d' in a row and return the corresponding value, first instance
find_vitamin_d <- function(row) {
vitamin_d_column <- which(grepl('vitamin', row))
if (length(vitamin_d_column) > 0) {
return(row[vitamin_d_column[1]])
} else {
return(NA)
}
}
# Apply the function row-wise to populate the "vitamin_d" column
phenotypic_data$vitamin_d <- apply(phenotypic_data, 1, find_vitamin_d)
## Grabbing vitamin_d values from the columns ##
# Initialize a new column "vitamin_d" with NA values
phenotypic_data$vitamin_d <- NA
# Function to find 'vitamin_d' in a row and return the corresponding value, first instance
find_vitamin_d <- function(row) {
vitamin_d_column <- which(grepl('vitamin', row))
if (length(vitamin_d_column) > 0) {
return(row[vitamin_d_column[1]])
} else {
return(NA)
}
}
# Apply the function row-wise to populate the "vitamin_d" column
phenotypic_data$vitamin_d <- apply(phenotypic_data, 1, find_vitamin_d)
## Grabbing cilia values from the columns ##
# Initialize a new column "cilia_length" with NA values
phenotypic_data$cilia_length <- NA
# Function to find 'cilia' in a row and return the corresponding value, first instance
find_cilia <- function(row) {
cilia_column <- which(grepl('cilia', row))
if (length(cilia_column) > 0) {
return(row[cilia_column[1]])
} else {
return(NA)
}
}
# Apply the function row-wise to populate the "cilia" column
phenotypic_data$cilia_length <- apply(phenotypic_data, 1, find_cilia)
## Now cut out the messy columns
phenotypic_data <- phenotypic_data[,-c(5:15)]
## Remove unnecessary prefix info
phenotypic_data$ethnicity <- gsub(".*: ", "", phenotypic_data$ethnicity )
phenotypic_data$age <- gsub(".*: ", "", phenotypic_data$age)
phenotypic_data$sex <- gsub(".*: ", "", phenotypic_data$sex)
phenotypic_data$vitamin_d <- gsub(".*: ", "", phenotypic_data$vitamin_d)
phenotypic_data$cilia_length <- gsub(".*: ", "", phenotypic_data$cilia_length)
phenotypic_data$pack_years<- gsub(".*, ", "", phenotypic_data$pack_years)
phenotypic_data$pack_years<- gsub("pack-years", "", phenotypic_data$pack_years)
# Reformat the submission dates to be sortable
phenotypic_data <- phenotypic_data %>%
mutate(submission_date = ifelse(submission_date == "Dec 20 2012", "2012-12-20", submission_date)) %>%
mutate(submission_date = ifelse(submission_date == "Jan 03 2008", "2008-01-08", submission_date)) %>%
mutate(submission_date = ifelse(submission_date == "Jan 31 2013", "2013-01-31", submission_date)) %>%
mutate(submission_date = ifelse(submission_date == "Jun 03 2010", "2010-06-03", submission_date)) %>%
mutate(submission_date = ifelse(submission_date == "Jun 13 2008", "2008-06-13", submission_date)) %>%
mutate(submission_date = ifelse(submission_date == "May 17 2007", "2007-05-17", submission_date)) %>%
mutate(submission_date = ifelse(submission_date == "Nov 08 2013", "2013-11-08", submission_date)) %>%
mutate(submission_date = ifelse(submission_date == "Nov 10 2014", "2014-11-10", submission_date))
head(phenotypic_data, n = 3)
## geo_accession status submission_date last_update_date
## GSM190150 GSM190150 Public on Dec 16 2008 2007-05-17 Aug 28 2018
## GSM190153 GSM190153 Public on Jun 17 2008 2007-05-17 Aug 28 2018
## GSM254157 GSM254157 Public on Jun 17 2008 2008-01-08 Aug 28 2018
## smoking_status ethnicity sex pack_years age vitamin_d cilia_length
## GSM190150 Non.smoker black M <NA> 34 <NA> <NA>
## GSM190153 Non.smoker hispanic F <NA> 45 <NA> <NA>
## GSM254157 Smoker white M 23 45 <NA> <NA>
# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("Non-smoker","Smoker"))
levels(gs) <- groups
gset$group <- gs
## Plot PCA 1 ##
colz <- as.numeric(as.factor(gs)) # Get color values from group
plotMDS(exprs(gset),
gene.selection = "common",
# main = "PCA for CS vs NS GSE63127",
col = colz,
pch = 1
)
legend("bottom",
legend = c("Smoker", "Non-smoker"),
col = c("#7570B3", "#1B9E77"), # Colors: only for smoking status
pch = c(15, 15), # Shapes: 2 = triangle, 1 = circle
pt.cex = c(1, 1), # Adjust size for better visibility
text.col = "black", # Text color
# bty = "n"
) # Box type: 'n' removes border
## We have 4 definite clusters that are not based on smoking status.
## As such, it is a good idea to check the table of sample phenotypic information to look for sources of variation between samples.
pointz <- as.numeric(as.factor(phenotypic_data$submission_date<= "2010-06-03")) # Get point shape values from date of submission: split into 2010 and earlier, post-2010]
## Plot PCA with date information##
plotMDS(exprs(gset),
gene.selection = "common",
# main = "PCA for CS vs NS GSE63127",
col = colz, # Colors smokers red and nonsmokers black
pch = pointz
#labels = gset$group
)
legend("bottom",
legend = c("Smoker", "Non-smoker",
"2010 and Prior", "Post-2010"),
col = c("#7570B3", "#1B9E77", "black", "black"), # Colors: only for smoking status
pch = c(15, 15, 2, 1), # Shapes: 2 = triangle, 1 = circle
pt.cex = c(1, 1, 1, 1), # Adjust size for better visibility
text.col = "black", # Text color
# bty = "n"
) # Box type: 'n' removes border
# Clearly the source of batch effect in PC1 is submission date post-2010.
# Note: I found out that the split was at 2010 by doing a bit of experimenting with other clustering methods, not shown here. This PCA simply shows proof that the split occurs at 2010.
# First batch correction (submission date)
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
## Warning: package 'nlme' was built under R version 4.3.3
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## Loading required package: BiocParallel
library(limma)
# Making a batch vector
submission_post_2010_batch <- ifelse(phenotypic_data$submission_date < as.Date("2012-01-01"), 1, 2)
# Adjust the expression matrix for submission date batch effect
exprs_matrix_combat <- ComBat(dat=exprs(gset), batch=submission_post_2010_batch, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Plot PCA for expression values after first batch correction ##
date_corrected_PCA <- plotMDS(exprs_matrix_combat,
gene.selection = "common",
# main = "PCA for CS vs NS GSE63127, corrected for submission date",
col = colz, # Colors smokers red and nonsmokers black
pch = pointz
)
legend("bottom",
legend = c("Smoker", "Non-smoker",
"2010 and Prior", "Post-2010"),
col = c("#7570B3", "#1B9E77", "black", "black"), # Colors: only for smoking status
pch = c(15, 15, 2, 1)
#title = "Smoking status"
)
## Some evidence that second source of variation could be due to sex (but only 11/182 samples have sex labels):
plotMDS(exprs_matrix_combat,
gene.selection = "common",
# main = "PCA for CS vs NS GSE63127, corrected for submission date",
col = colz, # Colors smokers red and nonsmokers black
#pch = pointz2 # Using separate shapes for all submission dates
labels = phenotypic_data$sex
)
legend("bottom",
legend = c("Smoker", "Non-smoker", "M = Male", "F = Female"),
col = c("#7570B3", "#1B9E77", "black", "black"),
pch = c(15, 15, NA, NA)
#title = "Smoking status"
)
## Samples are divided by sex, but 11/182 samples is not enough to draw a definitive conclusion here.
## Second correction for unknown source of variation using ComBat: ##
# Assign batch labels based on the first dimension from MDS (equivalent to PC1), since the dividing line for the batches lies at 0
unknown_batch_labels <- ifelse(date_corrected_PCA$x < 0, 1, 2)
# Do a second batch correction
exprs_matrix_combat_2 <- ComBat(dat=exprs_matrix_combat, batch=unknown_batch_labels, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
# View PCA plot
plotMDS(exprs_matrix_combat_2,
gene.selection = "common",
main = "PCA for CS vs NS GSE63127 after second correction",
col = colz, # Colors smokers red and nonsmokers black
pch = pointz
#labels = gset$group
)
legend("topleft",
legend = c("Smoker", "Non-smoker",
"2010 and Prior", "Post-2010"),
col = c("#7570B3", "#1B9E77", "black", "black"), # Colors: only for smoking status
pch = c(15, 15, 2, 1), # Shapes: 2 = triangle, 1 = circle
pt.cex = c(1, 1, 1, 1), # Adjust size for better visibility
text.col = "black", # Text color
# bty = "n"
)
## Now PC1 corresponds quite well to smoking status after the two ComBat corrections.
# Finish setting up the design matrix
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(gs)
## Crucial bit: Replace the expression values in gset with the batch corrected ones ##
exprs(gset) <- as.matrix(exprs_matrix_combat_2)
# calculate precision weights and show plot of mean-variance trend
v <- vooma(gset, design, plot=T)
v$genes <- fData(gset) # attach gene annotations
## Making the processed expression data to save
# vooma_expr <- as.data.frame(v[["E"]])
# vooma_expr <- cbind(Gene = fData(gset)$Gene.symbol, vooma_expr)
# write.table(vooma_expr, "../2_Outputs/2_Airway_expression/A1_exprs_20250403.txt")
# fit linear model
fit <- lmFit(v)
# set up contrasts of interest and recalculate model coefficients
cts <- paste(groups[2], groups[1], sep="-")
cont.matrix <- makeContrasts(contrasts=cts, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
# compute statistics and table of top significant genes
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
tT <- subset(tT, select=c("ID","Gene.symbol","logFC","adj.P.Val"))
GSE63127_CS_NS_limma <- tT %>%
filter(Gene.symbol != "") %>% # Remove blank gene symbols
group_by(Gene.symbol) %>%
slice_min(adj.P.Val, with_ties = TRUE) %>%
# For probesets mapping to same gene, keep one with lowest FDR. Keep ties for now to check later.
ungroup()
head(GSE63127_CS_NS_limma)
## # A tibble: 6 × 4
## ID Gene.symbol logFC adj.P.Val
## <chr> <chr> <dbl> <dbl>
## 1 229819_at A1BG -0.106 0.481
## 2 232462_s_at A1BG-AS1 0.531 0.0224
## 3 220951_s_at A1CF 0.302 0.123
## 4 1558450_at A2M 0.110 0.453
## 5 1564139_at A2M-AS1 -0.138 0.0724
## 6 1553505_at A2ML1 0.145 0.636
# Checking for ties
ties <- GSE63127_CS_NS_limma %>%
group_by(Gene.symbol) %>%
filter(n() > 1) %>%
ungroup()
print(ties) # No ties
## # A tibble: 0 × 4
## # ℹ 4 variables: ID <chr>, Gene.symbol <chr>, logFC <dbl>, adj.P.Val <dbl>
nrow(GSE63127_CS_NS_limma)
## [1] 22189
# For now we are not applying any FDR or log2FC thresholds, that occurs in script 2.1
FDR_cutoff_A1 <- 0.05
log2FC_cutoff_A1 <- 0.25
v_A1 <- EnhancedVolcano::EnhancedVolcano(
toptable = GSE63127_CS_NS_limma,
lab = GSE63127_CS_NS_limma$Gene.symbol,
x = "logFC", # "mean difference" is estimate here
y = "adj.P.Val",
# pCutoffCol = 'min_smoothed_fdr',
xlab = "log2FC",
ylab = "-log10(FDR)",
title = "A1 DEGs",
subtitle = paste0("log2FC cutoff: ", log2FC_cutoff_A1, " FDR cutoff: ", FDR_cutoff_A1),
caption = paste0("Total = ", nrow(GSE63127_CS_NS_limma[abs(GSE63127_CS_NS_limma$logFC)>log2FC_cutoff_A1 & GSE63127_CS_NS_limma$adj.P.Val < FDR_cutoff_A1,]), " significant DEGs meeting cutoffs"),
col = c("grey30", "mediumpurple2", "royalblue", "orange2"),
legendPosition = "bottom",
labSize = 3,
max.overlaps = 3,
drawConnectors = TRUE,
widthConnectors = 0.3,
arrowheads = FALSE,
pCutoff = FDR_cutoff_A1,
FCcutoff = log2FC_cutoff_A1,
gridlines.minor = FALSE,
gridlines.major = FALSE,
xlim = c(-3, 6)
)
v_A1
## Warning: ggrepel: 3577 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Change date suffix as appropriate if changes are made
#write.table(GSE63127_CS_NS_limma, "../2_Outputs/1_Airway_DEGs/GSE63127_CS_NS_limma_20250307.txt", sep = '\t')
# load series and platform data from GEO
gset <- getGEO("GSE7895", GSEMatrix =TRUE, AnnotGPL=TRUE)
## Found 1 file(s)
## GSE7895_series_matrix.txt.gz
if (length(gset) > 1) idx <- grep("GPL96", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
# make proper column names to match toptable
fvarLabels(gset) <- make.names(fvarLabels(gset))
# group membership for all samples
gsms <- paste0("22222222222222222222200000000000000000000000000000",
"00000000000000000000000111111111111111111111111111",
"1111")
sml <- strsplit(gsms, split="")[[1]]
gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("current_smoker","former_smoker","never_smoker"))
levels(gs) <- groups
gset$group <- gs
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(gs)
gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
## Make histograms and boxplots to check if the data is log-transformed and needs quantile normalization ##
hist(as.matrix(exprs(gset))) # Values range 1-15, and 1 big peak around 3.
boxplot(exprs(gset)) # Same range of values, with similar-looking ranges, but not exactly the same
# Narrow range, therefore no log2 normalization needed
exprs(gset) <- normalizeBetweenArrays(exprs(gset))
boxplot(exprs(gset))
# 2024/11/12: I elected to use quantile normalization
min(exprs(gset))
## [1] -0.2140344
max(exprs(gset))
## [1] 14.59784
## Plot PCA ##
colz <- as.numeric(as.factor(gs)) # Get color values from group
plotMDS(exprs(gset),
gene.selection = "common",
main = "PCA for GSE7895",
col = colz,
pch = 1
#labels = gs
)
legend("topright", legend = levels(as.factor(gs)),
fill = unique(colz),
title = "Smoking status")
# No separation
library(stringr)
phenotypic_data <- pData(gset) # Extract phenotypic data
# List of column names I want to keep and clean up into usable labels
columns_to_find <- c("characteristics_ch1.1", "group")
# Get the column indexes
indexes <- sapply(columns_to_find, function(col_name) which(names(phenotypic_data) == col_name))
indexes <- unlist(indexes)
phenotypic_data <- phenotypic_data[,c(indexes)]
# Extract Age
phenotypic_data$age <- as.numeric(str_extract(phenotypic_data$characteristics_ch1.1, "(?<=Age:)\\d+"))
# Extract Packyears
phenotypic_data$packyears <- as.numeric(str_extract(phenotypic_data$characteristics_ch1.1, "(?<=Packyears:)\\d+"))
# Extract Time Since Quit Smoking (months)
phenotypic_data$TSQ_months <- as.numeric(str_extract(phenotypic_data$characteristics_ch1.1, "(?<=Time Since Quit Smoking \\(months\\):)\\d+"))
# Delete the original column with the unseparated info
phenotypic_data <- phenotypic_data[,-1]
# Convert the NA values for packyears for never smokers to zero (this makes sense since the never smokers have 0 pack years)
phenotypic_data$packyears[phenotypic_data$group=="never_smoker"] <- 0
# Convert the NA values for TSQ_months to zero for current smokers (again makes sense)
phenotypic_data$TSQ_months[phenotypic_data$group=="current_smoker"] <- 0
# Make column to denote just former smoking status for the linear model
phenotypic_data$former_smoking_status <- as.factor(as.numeric(phenotypic_data$group == "former_smoker"))
# Make column to denote just current smoking status for the linear model
phenotypic_data$current_smoking_status <- as.factor(as.numeric(phenotypic_data$group == "current_smoker"))
# Make column to denote just never smoking status for the linear model
phenotypic_data$never_smoking_status <- as.factor(as.numeric(phenotypic_data$group == "never_smoker"))
## Plot PCA using age to define color
# Create a gradient color palette (light blue to dark blue)
palette <- colorRampPalette(c("lightblue", "darkblue"))
## Plot PCA of age ##
colz_age <- palette(length(phenotypic_data$age))[rank(phenotypic_data$age)] # Map ages to gradient colors
plotMDS(exprs(gset),
gene.selection = "common",
main = "PCA for GSE7895 (darker blue ~ higher age)",
col = colz_age,
pch = 1
)
# Add a color bar for age
legend("topright", legend = range(phenotypic_data$age),
fill = palette(2),
title = "Age")
# Does not seem to be an age effect
### Plot PCA of packyears ###
# Excluding packyears of zero (never smokers)
pheno_packyears <- phenotypic_data[phenotypic_data$packyears!=0,]
exprs_packyears <- as.data.frame(exprs(gset)) %>%
dplyr::select(rownames(pheno_packyears))
colz_packyears <- palette(length(pheno_packyears$packyears))[rank(pheno_packyears$packyears)] # Map packyears to gradient colors
plotMDS(exprs_packyears,
gene.selection = "common",
main = "PCA for GSE7895 (darker blue ~ higher packyears)",
col = colz_packyears,
pch = 1
#labels = gs
)
# Add a color bar for packyears
legend("topright", legend = range(pheno_packyears$packyears),
fill = palette(2),
title = "Packyears")
## Does not seem to be packyears effect
### Plot PCA of time since quitting ###
pheno_tsq <- phenotypic_data[!is.na(phenotypic_data$TSQ_months),]
exprs_tsq <- as.data.frame(exprs(gset)) %>%
dplyr::select(rownames(pheno_tsq))
colz_TSQ <- palette(length(pheno_tsq$TSQ))[rank(pheno_tsq$TSQ)] # Map packyears to gradient colors
plotMDS(exprs_tsq,
gene.selection = "common",
main = "PCA for GSE7895 (darker blue ~ more time since quitting)",
col = colz_TSQ,
pch = 1
#labels = gs
)
legend("topright", legend = range(pheno_tsq$TSQ),
fill = palette(2),
title = "TSQ")
## Does not seem to be TSQ effect
This did not determine the underlying source of variation explaining sample distribution.
v <- vooma(gset, design, plot=T)
v$genes <- fData(gset) # attach gene annotations
# fit linear model
fit <- lmFit(v)
# set up contrasts of interest and recalculate model coefficients
#cts <- c(paste(groups[1],"-",groups[2],sep=""), paste(groups[1],"-",groups[3],sep=""), paste(groups[2],"-",groups[3],sep=""))
#cont.matrix <- makeContrasts(contrasts=cts, levels=design)
cont.matrix <- makeContrasts(
CS_vs_NS = current_smoker - never_smoker,
FS_vs_NS = former_smoker - never_smoker,
CS_vs_FS = current_smoker - former_smoker,
levels = design
)
fit2 <- contrasts.fit(fit, cont.matrix)
# compute statistics and table of top significant genes
fit2 <- eBayes(fit2, proportion = 0.01) # Proportion is "assumed proportion of genes which are differentially expressed"
library(dplyr)
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
##
## Attaching package: 'futile.logger'
## The following object is masked from 'package:mgcv':
##
## scat
## Separate out genes that are DEGS in CS vs NS and FS vs NS
## Note: I have decided not to filter out genes that are significantly different between CS and FS because I realized that doesn't make logical sense.
# summarize test results as "up", "down" or "not expressed"
dT <- decideTests(fit2, adjust.method="fdr",
p.value=0.05,
lfc=0)
# Note that this already has a p-value cutoff of 0.05 unlike my other datasets, but this is ok because this is the least stringent possible cutoff we will use anyway
# Venn diagram of results
vennDiagram(dT)
# Select the genes differentially expressed in both CS_vs_NS and FS_vs_NS
dT_persistent <- dT %>%
as.data.frame(.) %>%
filter(CS_vs_NS != 0) %>% # Differentially expressed in CS vs NS
filter(CS_vs_NS == FS_vs_NS)# Differentially expressed, same sign in FS vs NS
nrow(dT_persistent) # 128 genes indeed
## [1] 128
# Get the toptable format for all genes
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) # Inf shows all the significant genes
# Filter to the "persistent" genes
tT_persistent <- tT %>%
filter(ID %in% rownames(dT_persistent))
# Filter out blanks, keep lower FDR of ties
tT_persistent <- tT_persistent %>%
filter(Gene.symbol != "") %>% # Remove blank gene symbols
group_by(Gene.symbol) %>%
slice_min(adj.P.Val, with_ties = TRUE) %>%
# For probesets mapping to same gene, keep one with lowest FDR. Keep ties for now to check later.
ungroup()
nrow(tT_persistent)
## [1] 116
## TO DO: Change this to filtering out duplicates
# Checking for ties
ties <- tT_persistent%>%
group_by(Gene.symbol) %>%
filter(n() > 1) %>%
ungroup()
print(ties)
## # A tibble: 2 × 28
## ID Gene.title Gene.symbol Gene.ID UniGene.title UniGene.symbol UniGene.ID
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 214303… mucin 5AC… MUC5AC 4586 "" "" ""
## 2 214385… mucin 5AC… MUC5AC 4586 "" "" ""
## # ℹ 21 more variables: Nucleotide.Title <chr>, GI <int>,
## # GenBank.Accession <chr>, Platform_CLONEID <lgl>, Platform_ORF <lgl>,
## # Platform_SPOTID <chr>, Chromosome.location <chr>,
## # Chromosome.annotation <chr>, GO.Function <chr>, GO.Process <chr>,
## # GO.Component <chr>, GO.Function.ID <chr>, GO.Process.ID <chr>,
## # GO.Component.ID <chr>, CS_vs_NS <dbl>, FS_vs_NS <dbl>, CS_vs_FS <dbl>,
## # AveExpr <dbl>, F <dbl>, P.Value <dbl>, adj.P.Val <dbl>
# As there is a tie with MUCA5 I will remove the MUCA5 probe with an "x" label for cross-reactivity
#tT_persistent <- tT_persistent %>% filter (ID != "214303_x_at")
#Pick the columns we care about
GSE7895_persistent_DEGs <- tT_persistent %>%
dplyr::select(., Gene.symbol, CS_vs_NS, FS_vs_NS, CS_vs_FS, adj.P.Val) %>%
dplyr::rename(., Gene = Gene.symbol, CS_NS_A2 = CS_vs_NS, FS_NS_A2 = FS_vs_NS, CS_FS_A2 = CS_vs_FS, FDR_A2 = adj.P.Val)
# Save output
#write.table(GSE7895_persistent_DEGs, "../2_Outputs/1_Airway_DEGs/GSE7895_persistent_DEGs_quantile_20250307.txt")
## Filter exprs to the "persistent" genes
exprs_persistent <- as.data.frame(exprs(gset)) %>%
filter(rownames(.) %in% tT_persistent$ID)
## Plot PCA ##
colz<- as.numeric(as.factor(gs)) # Get color values from group
plotMDS(exprs_persistent,
gene.selection = "common",
main = "PCA for GSE7895 with persistent genes",
col = colz,
pch = 1
#labels = gs
)
legend("topright", legend = levels(as.factor(gs)),
fill = unique(colz),
title = "Smoking status")
# You can see more separation happening, but I would expect to see current and former smokers more mixed together, whereas we see former and never smokers more mixed together. Hmm okay, interesting at least.
# Might be good to check on the age, packyears and TSQ here as well?
## Plot PCA of age ##
colz_age <- palette(length(phenotypic_data$age))[rank(phenotypic_data$age)] # Map ages to gradient colors
plotMDS(exprs_persistent,
gene.selection = "common",
main = "PCA for GSE7895 (darker blue ~ higher age)",
col = colz_age,
pch = 16
)
# Add a color bar for age
legend("topright", legend = range(phenotypic_data$age),
fill = palette(2),
title = "Age")
# Does not seem to be an age effect
### Plot PCA of packyears ###
# Excluding packyears of zero (never smokers)
exprs_persistent_packyears <- as.data.frame(exprs_persistent) %>%
dplyr::select(rownames(pheno_packyears))
colz_packyears <- palette(length(pheno_packyears$packyears))[rank(pheno_packyears$packyears)] # Map packyears to gradient colors
plotMDS(exprs_persistent_packyears,
gene.selection = "common",
main = "PCA for GSE7895 persistent genes (darker blue ~ higher packyears)",
col = colz_packyears,
pch = 16
#labels = gs
)
# Add a color bar for packyears
legend("bottomleft", legend = range(pheno_packyears$packyears),
fill = palette(2),
title = "Packyears")
## Maybe some sort of packyears effect happening, not obviously so
### Plot PCA of time since quitting ###
exprs_persistent_tsq <- as.data.frame(exprs_persistent) %>%
dplyr::select(rownames(pheno_tsq))
colz_TSQ <- palette(length(pheno_tsq$TSQ))[rank(pheno_tsq$TSQ)] # Map packyears to gradient colors
plotMDS(exprs_persistent_tsq ,
gene.selection = "common",
main = "PCA for GSE7895 persistent genes (darker blue ~ more months since quitting)",
col = colz_TSQ,
pch = 16
#labels = gs
)
legend("bottomleft", legend = range(pheno_tsq$TSQ),
fill = palette(2),
title = "TSQ")
## Maybe some TSQ effect but not super obvious